## Cohort simulation model 
## Cohorts by age, income, wealth, bequest received, homeowner

## This file draws together initial cohort population data and all parameters. 
## Then creates a list of data frames, one data frame per year, with each containing the relevant parameters for each cohort in that year. 
## Resulting lists for the base scenario and historic returns scenario are saved in the ‘Output data’ folder. 


# Preliminaries -----------------------------------------------------------

rm(list=ls())
gc()

memory.limit(120000)

## ensure packages are loaded
source("./R scripts/Master package loading.R")


# Read necessary data -----------------------------------------------------

starting_cohorts <- qread("./Input data/starting_cohorts_aiwbh_p.qs") 

income_tax_super_projection <- qread( "./Input data/income_tax_super_projection_ai.qs") 

income_transitions <- qread("./Input data/income_transition_probs.qs") 

mortality_rates <- qread("./Input data/mortality_rates_ay.qs") 

bequest_receipt_prob <- qread("./Input data/bequest_receipt_prob_aiwbh_yp.qs") 

home_transition_prob <- qread("./Input data/housing_transition_probs_aiy.qs") 

wealth_accum_params <- qread("./Input data/wealth_accum_params_aiwbh.qs") 

gifting_rates <- qread("./Input data/gift_giving_rate_a.qs") 


# Additional model parameters --------------------------------------------------------

inc_grps <- uniqueN(starting_cohorts$total_inc_qtile)
wlth_grps <- uniqueN(starting_cohorts$total_wealth_qtile)
age_grps_n <- uniqueN(starting_cohorts$age_grp)

age_grp_concord <- distinct(starting_cohorts, age_grp, age_grp2)

projection_years <- c(2018:2050) 
projection_years_id <- c(0: (length(projection_years)-1))

year_concord <- data.frame(year=projection_years_id,
                           year_actual=projection_years)

## year interval between income transitions
inc_trans_interval <- 5

## how many income transitions within projection period - 6
inc_trans_n <- floor (max(projection_years_id) / inc_trans_interval )

## ages where incomes are allowed to transition at the 5 year mark (ie age they are after they have transitioned)
inc_trans_ages <- data.frame(age_grp=unique(starting_cohorts$age_grp)) %>% 
  mutate(inc_trans_possible=  case_when(
    as.numeric(age_grp) %in% c(which(levels(starting_cohorts$age_grp)=="[15,20)") : which(levels(starting_cohorts$age_grp)=="[35,40)") ) ~ 1,
    as.numeric(age_grp) %in% c(which(levels(starting_cohorts$age_grp)=="[60,65)") : which(levels(starting_cohorts$age_grp)=="[70,75)") ) ~ 1,
    TRUE ~ 0
  ) %>% as.logical
  )

## ages where transitions into home ownership are possible
home_trans_ages <- home_transition_prob %>% 
  ungroup %>% 
  filter(home_trans_prob>0) %>% 
  distinct(age_grp) %>% 
  mutate(home_trans_possible=T) %>% 
  right_join(age_grp_concord) %>% 
  mutate(home_trans_possible = ifelse(is.na(home_trans_possible), F, home_trans_possible)) %>% 
  arrange(age_grp) %>% 
  filter(home_trans_possible==T)


# Quality of life edits to pulled in data ---------------------------------------------------

starting_cohorts <- starting_cohorts %>% 
  rename(housing_assets = housing_assets_av_smooth,
         super_assets = super_assets_av_smooth,
         other_assets = other_assets_av_smooth,
         housing_debt = housing_debt_av_smooth,
         
         wages_inc = wages_av_smooth,
         other_inc = other_inc_av_smooth) %>% 
  mutate(starting_cohort = paste0("SA", age_grp, " SI", total_inc_qtile, " SW", total_wealth_qtile, " SB", bequest_received, " SH", homeowner))


income_tax_super_projection_qol <- income_tax_super_projection %>% 
  left_join(year_concord, by=c("year"="year_actual") ) %>% 
  select(-year, year=year.y) %>% 
  filter(year %in% projection_years_id) %>% 
  mutate(netincome = income - income_tax - medicare_levy) %>% 
  select(age_grp2, total_inc_qtile, income, netincome, marginal_tax, super_contrib, year)


mortality_rates_qol <- mortality_rates %>% 
  rename(death_rate=mortality_rate) %>% 
  select(-age)


inc_trans_ages_only <- inc_trans_ages %>% filter(inc_trans_possible) %>% select(age_grp) %>% c
## ensure there is a value for each possible transition (eg missing 20-25 5-1)
income_transitions_qol <- expand_grid(age_grp = unique(starting_cohorts$age_grp), 
                                      trans_from = unique(starting_cohorts$total_inc_qtile),
                                      total_inc_qtile = unique(starting_cohorts$total_inc_qtile)) %>% 
  left_join(income_transitions %>% 
              select(-(avn:total_inc_meanmean)) %>% 
              mutate(total_inc_qtile= as.integer(total_inc_qtile)) %>% 
              data.table %>% 
              ## make age group age at which they are AT transition, not 5 years prior
              mutate(age_grp = (as.integer(age_grp)+1) %>% factor(., levels=c(1:age_grps_n), labels=levels(starting_cohorts$age_grp), ordered=T),
                     trans_from = as.character(total_inc_qtile),
                     total_inc_qtile = factor(total_inc_qtile_lead5, levels=c(1:inc_grps), labels=levels(starting_cohorts$total_inc_qtile), ordered=T)) %>% 
              select(age_grp, trans_from, total_inc_qtile, avprob)
  ) %>% 
  filter(age_grp %in% unlist(inc_trans_ages_only)) %>% 
  mutate(avprob = ifelse(age_grp=="[20,25)" & trans_from==5 & total_inc_qtile==1, 0, avprob))


wealth_accum_params_qol_0 <- wealth_accum_params %>% 
  left_join(year_concord, by=c("year"="year_actual") ) %>% 
  select(-year, year=year.y, ho=homeowner) %>% 
  filter(year %in% projection_years_id) %>% 
  mutate(total_inc_qtile = factor(total_inc_qtile, levels=c(1:inc_grps), labels=levels(starting_cohorts$total_inc_qtile), ordered=T)) 
## copy the same values of homeowner==1 for homeowner = -1 (in the model, homeowner -1 means bought a home this year, 1 means bought a home in prev years)
wealth_accum_params_qol_1 <- wealth_accum_params_qol_0 %>% 
  filter(ho==1) %>% mutate(ho=-1)
wealth_accum_params_qol <- rbind(wealth_accum_params_qol_0, wealth_accum_params_qol_1)


## effect of inheritance on home transition probability applied in model source code (not here). 
## Keep this to no transfer probabilities for each bequest status
home_transition_prob_qol <- rbind(home_transition_prob %>% mutate(beqrec=0), ## inheritance not received
                                  home_transition_prob %>% mutate(beqrec=1), ## inheritance received this year
                                  home_transition_prob %>% mutate(beqrec=-1))  ## inheritance received in the past

## remove old objects
rm(income_tax_super_projection, mortality_rates, income_transitions, 
   wealth_accum_params, wealth_accum_params_qol_0, wealth_accum_params_qol_1,
   home_transition_prob)



# Get all possible income pathways by year -------------------------------------------------------------

## Code is flexible enough to handle changes to above parameters
## create empty rows for each possible income transition pathway, which will be bound to starting cohorts

## get all possible end pathways
end_pathways <- 0
for(i in 1:(inc_trans_n)) {
  add_path <- 1:inc_grps
  end_pathways <- expand_grid(end_pathways, add_path, .name_repair = "minimal")
}

names(end_pathways) <- paste0("T", c(0:inc_trans_n)) ## T for transition point

## get unique intermediate pathways at each transition point
inter_pathways <- lapply(names(end_pathways), function(x) {
  inter_paths <- end_pathways %>% 
    distinct(., across(names(.)[1]:x)) %>% 
    unite("inc_pathway", everything(), sep="", remove=T) 
}) %>% 
  setNames(names(end_pathways))

## years that income transitions happen
inc_trans_years <- data.frame(
  year=projection_years_id
) %>% 
  mutate(inc_trans_year = ifelse(year%%inc_trans_interval==0 & year!=0, 1, 0) %>% as.logical # every multiple of 5 is a inc_trans_year
  )


# Get all possible age, year, beqrec status and homeowner status combos, by starting cohort  -----------------------------------------

## Aim is to determine for each starting cohort, all the possible transition pathways for them (inc trans, beqrec trans, homeown trans)

## create a list of starting age groups / starting bequest recip / starting homeowner by year while still living, for each starting cohort

age_years_combo_bh <- expand_grid(age_grp_0 = unique(starting_cohorts$age_grp), 
                                  year = projection_years_id, ## current year of projection
                                  beqrec_0 = c(0,1), ## starting bequest received status
                                  ho_0 = c(0,1),  ## starting homeowner status
                                  beqrec = c(-1,0,1), ## you either have received a bequest or not. 0 = not received, 1 = received previously, -1 = received this year (which will be combined with 1 in final model results)
                                  ho = c(-1,0,1) ## you either are a homeowner or not. 0,1,-1 as above
) %>% 
  
  ## age the cohorts every 5 years
  mutate(age_increment = ceiling((year+1)/5)-1, ## gets increment relative to current age group (e.g. given the year, age group will be 3 factor levels higher than current age group)
         age_grp = (as.numeric(age_grp_0) + age_increment) %>% 
           ## add factor levels back
           factor(., levels=c(1:age_grps_n), labels=levels(age_grp_0), ordered=T)
  ) %>% 
  ## it is assumed that no one lives past 5 years of being in the [100-105] age grp. So any NAs of age grp means they are not alive in that year
  filter(!is.na(age_grp)) %>% 
  select(-age_increment) %>% 
  
  ## Additional limitations to the number of pathways:
  ## 0-15 year olds can't be homeowners - remove these
  filter(!(age_grp <= "[10,15)" & ho %in% c(-1,1))) %>% 
  ## transitions into homeownership not possible at certain ages - remove any transitions after this age
  mutate(homeown_possible = ifelse(age_grp %in% home_trans_ages$age_grp, 1, 0)) %>% 
  group_by(age_grp_0, beqrec_0, ho_0) %>% 
  mutate(homeown_possible_lifetime = sum(homeown_possible)) %>% 
  filter(!(homeown_possible_lifetime==0 & ho_0==0 & ho %in% c(-1,1))) %>% 
  select(-homeown_possible, -homeown_possible_lifetime) %>% 
  ## year 0 and beqrec and ho status must be consistent
  filter(!(year==0 & (beqrec_0!=beqrec | ho_0 != ho))) %>% 
  filter(!( (beqrec_0==1 & beqrec %in% c(-1,0)) | (ho_0==1 & ho%in% c(-1,0) ) )) %>% 
  ## no one can receive a bequest in year 1 because no one has died yet
  filter(!(year==1 & beqrec==-1) ) %>% 
  ## year 2 is first year of moving into beqrec (-1) and year 1 is first year of moving into home ownership (-1)
  filter(!(year<=2 & beqrec_0==0 & beqrec==1)) %>% 
  filter(!(year<=1 & ho_0==0 & ho==1)) %>% 
  ungroup

## determine how many income transitions have occured at this point for this starting cohort
age_years_combo_i <-  age_years_combo_bh %>% 
  distinct(age_grp_0, age_grp, year) %>% 
  ## get ages at which income transition is allowed by age
  left_join(inc_trans_ages) %>% 
  ## get years that inc transitions happen at every 5 years
  left_join(inc_trans_years) %>% 
  # number of transitions that have occured at this point for this starting cohort
  group_by(age_grp_0) %>% 
  mutate(inc_trans_num = cumsum(inc_trans_year*inc_trans_possible)) 

age_years_combo <- left_join(age_years_combo_bh, age_years_combo_i) 

## add all missing starting cohort identifiers (inc and wealth) and expand out
starting_cohorts_pathways <- expand_grid(age_years_combo, 
                                         inc_0=unique(starting_cohorts$total_inc_qtile), 
                                         wealth_0=unique(starting_cohorts$total_inc_qtile)) %>% 
  ## remove any combinations that aren't actually found in starting_cohorts
  mutate(starting_cohort = paste0("SA", age_grp_0, " SI", inc_0, " SW", wealth_0, " SB", beqrec_0, " SH", ho_0)) %>% 
  filter(starting_cohort %in% starting_cohorts$starting_cohort) 

rm(end_pathways, inc_trans_years, age_years_combo_bh, age_years_combo_i, age_years_combo)



# Attach all income pathways and starting cohort combos together -------------------

## using data.table for speed

gc()
## unique income pathways by starting age and year and b and h 
starting_cohorts_pathways <- starting_cohorts_pathways %>% 
  ## for each unique number of inc transitions the cohort is up to
  split(., .$inc_trans_num) %>% 
  lapply(., function(x) {
    ## repeat rows in x by inter_pathways rows, based on what transition number it is up to
    gc()
    inc_trans_num <- x[[1,"inc_trans_num"]]
    print(inc_trans_num)
    expand_grid(x, inter_pathways[[(inc_trans_num+1)]]) 
  }
  ) %>% 
  rbindlist  

## record current income quantile by year , and add factor levels back
starting_cohorts_pathways[, inc_pathway := paste0(as.character(inc_0), stri_sub(inc_pathway, 2, -1)) ]
starting_cohorts_pathways[, total_inc_qtile := as.numeric(stri_sub(inc_pathway, -1, -1)) ]
starting_cohorts_pathways[, total_inc_qtile := factor(total_inc_qtile, levels=c(1:inc_grps), labels=levels(starting_cohorts$total_inc_qtile), ordered=T)]


# Attach mortality rates, incomes, home rates, wealth params etc  --------

## attach mortality rates by age and inc group
## first create mortality_year variable to get correct mortality rates for the year. ie for first year the cohort is at that age group, use mrtality_year 1 rate. For second year, use mortality_year 2 rate, etc
## only one death rate for 100-105 yos, make mortality_year=1 to match
starting_cohorts_pathways[, mortality_year := ifelse(age_grp=="[100,105]", 1, year%%5 +1 )  ]
starting_cohorts_pathways <- merge(starting_cohorts_pathways, mortality_rates_qol, by=c("age_grp", "mortality_year"), all.x=T)
## edit mortality rate to 1 if age is 100-105 and it is the last time period for that age group (not including 32, the end of our projection period - ppl continue to live after that)
starting_cohorts_pathways[, death_rate := ifelse(age_grp=="[100,105]" & year==max(year) & year!=max(projection_years_id), 1, death_rate),
                          by=age_grp_0]
rm(mortality_rates_qol)

## attach projected incomes - merge with age concord to enable merging with inc projections
starting_cohorts_pathways <- merge(starting_cohorts_pathways, age_grp_concord, by="age_grp", all.x=T)
starting_cohorts_pathways <- merge(starting_cohorts_pathways, income_tax_super_projection_qol, by=c("age_grp2", "total_inc_qtile", "year"), all.x=T)

## merge in income and homeowner transition probability data - (bequest receipt prob done later)
starting_cohorts_pathways[, trans_from := ifelse(inc_trans_possible==T & inc_trans_year==T, 
                                                 stri_sub(inc_pathway, -2, -2),
                                                 NA)]
starting_cohorts_pathways <- merge(starting_cohorts_pathways, income_transitions_qol, by=c("age_grp", "trans_from", "total_inc_qtile"), all.x=T)
starting_cohorts_pathways[, avprob := ifelse(is.na(avprob), 1, avprob)] ## if missing an inc transition probability, make it 1 so that everyone from prev_n "transitions" in the formula below

starting_cohorts_pathways <- merge(starting_cohorts_pathways, home_transition_prob_qol, by=c("age_grp", "mortality_year", "total_inc_qtile", "beqrec"), all.x=T) 

## merge in wealth accum params
starting_cohorts_pathways <- merge(starting_cohorts_pathways, wealth_accum_params_qol, by=c("age_grp", "age_grp2", "total_inc_qtile", "ho", "year"), all.x=T)

## merge in gift giving rate
starting_cohorts_pathways <- merge(starting_cohorts_pathways, gifting_rates, by=c("age_grp"), all.x=T )

gc()




# Save data ---------------------------------------------------------------

## split into list by year - save for use in model running
year_list_0 <- starting_cohorts_pathways %>% 
  split(., .$year) 

## year list 0 with historic returns
qsave(year_list_0, "./Output data/year_list_0_historic.qs")


## Create base scenario year list by converging rates of return to 4% over 10 years

converge_rate <- 0.04
converge_years <- 10


year_list_0_converge <- year_list_0 %>% 
  ## adjust rate of return 
  lapply(., function(x) {
    x %>% mutate(housing_asset_return_param = ifelse(year %in% c(1:converge_years),
                                                     housing_asset_return_param-(housing_asset_return_param-converge_rate)/(converge_years-1)*(year-1),
                                                     converge_rate), 
                 housing_asset_saving_param = ifelse(housing_asset_saving_param>0,
                                                     ## reduce value of new purchases by degree of difference between this scenario and historic scenario returns
                                                     housing_asset_saving_param* housing_asset_return_param/0.07,
                                                     housing_asset_saving_param),
                 super_asset_return_param = ifelse(year %in% c(1:converge_years),
                                                   super_asset_return_param-(super_asset_return_param-converge_rate)/(converge_years-1)*(year-1),
                                                   converge_rate), 
                 other_asset_return_param = ifelse(year %in% c(1:converge_years),
                                                   other_asset_return_param-(other_asset_return_param-converge_rate)/(converge_years-1)*(year-1),
                                                   converge_rate)
    )
  })

## save year list for base scenario
qsave(year_list_0_converge, "./Output data/year_list_0_returns_converge.qs")


starting_cohorts_pathways_converge <- year_list_0_converge %>% rbindlist

## save the base scenario expanded pathways for future reference - used in step 4 scenario analysis params
qsave(starting_cohorts_pathways_converge, "./Output data/starting_cohorts_pathways_20211026.qs")


sapply(year_list_0, nrow)
gc()
